In [51]:
from collections import Counter
import os
import re
import sys
import time

from cltk.corpus.utils.formatter import assemble_phi5_works_filepaths
from cltk.corpus.utils.formatter import phi5_plaintext_cleanup
from cltk.tokenize.sentence import TokenizeSentence
from cltk.tag.pos import POSTag
from nltk.tokenize.punkt import PunktLanguageVars
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
import pandas as pd

In [2]:
def works_texts_list(rm_punctuation, rm_periods):
    fps = assemble_phi5_works_filepaths()
    curly_comp = re.compile(r'{.+?}')
    _list = []
    for fp in fps:
        with open(fp) as fo:
            fr = fo.read()
        text = phi5_plaintext_cleanup(fr, rm_punctuation, rm_periods)
        text = curly_comp.sub('', text)
        _list.append(text)
    return _list

In [6]:
t0 = time.time()
text_list = works_texts_list(rm_punctuation=True, rm_periods=True)
print('Total texts', len(text_list))
print('Time to build list of texts: {}'.format(time.time() - t0))


Total texts 836
Time to build list of texts: 81.57707095146179
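
Rebuilding the cleaned text list takes over a minute, so caching it to disk can save time on later runs. A minimal sketch of one way to do that (my addition; the pickle path is hypothetical):

In [ ]:
import pickle

# hypothetical cache path under the CLTK user data directory
cache_fp = os.path.expanduser('~/cltk_data/user_data/phi5_text_list.pickle')
if os.path.isfile(cache_fp):
    with open(cache_fp, 'rb') as f:
        text_list = pickle.load(f)
else:
    text_list = works_texts_list(rm_punctuation=True, rm_periods=True)
    with open(cache_fp, 'wb') as f:
        pickle.dump(text_list, f)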

Bag of words, individual word count


In [5]:
# bag of words/word count
def bow_csv():
    t0 = time.time()
    vectorizer = CountVectorizer(min_df=1)
    term_document_matrix = vectorizer.fit_transform(text_list)  # fit before asking for feature names
    column_names = ['wc_' + w for w in vectorizer.get_feature_names()]
    dataframe_bow = pd.DataFrame(term_document_matrix.toarray(), columns=column_names)
    print('DF BOW shape', dataframe_bow.shape)

    fp = os.path.expanduser('~/cltk_data/user_data/bow_latin.csv')
    dataframe_bow.to_csv(fp)
    print('Time to create BOW vectorizer and write csv: {}'.format(time.time() - t0))

In [6]:
#bow_csv()
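
To see what the fitted vectorizer produces, here is a toy sketch on two made-up snippets (not from the PHI5 corpus); note that recent scikit-learn releases rename get_feature_names() to get_feature_names_out():

In [ ]:
# toy term-document count matrix; fit first, then ask for the vocabulary
toy_docs = ['arma virumque cano', 'arma arma cano troiae']
toy_vectorizer = CountVectorizer(min_df=1)
toy_matrix = toy_vectorizer.fit_transform(toy_docs)
print(toy_vectorizer.get_feature_names())  # ['arma', 'cano', 'troiae', 'virumque']
print(toy_matrix.toarray())                # [[1 1 0 1]
                                           #  [2 1 1 0]]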

tf-idf


In [7]:
# tf-idf
def tfidf_csv():
    t0 = time.time()
    vectorizer = TfidfVectorizer(min_df=1)
    term_document_matrix = vectorizer.fit_transform(text_list)  # fit before asking for feature names
    column_names = ['tfidf_' + w for w in vectorizer.get_feature_names()]
    dataframe_tfidf = pd.DataFrame(term_document_matrix.toarray(), columns=column_names)
    print('DF tf-idf shape', dataframe_tfidf.shape)
    
    fp = os.path.expanduser('~/cltk_data/user_data/tfidf_latin.csv')
    dataframe_tfidf.to_csv(fp)
    print('Time to create tf-idf vectorizer and write csv: {}'.format(time.time() - t0))

In [1]:
#tfidf_csv()
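
tf-idf follows the same pattern but downweights terms that occur in many documents. In the toy snippets above, 'arma' and 'cano' appear in both documents, so for the same raw count they get lower weights than 'virumque' or 'troiae'. A sketch, reusing the hypothetical toy documents:

In [ ]:
# toy tf-idf weights for the same two made-up snippets
toy_docs = ['arma virumque cano', 'arma arma cano troiae']
toy_tfidf = TfidfVectorizer(min_df=1)
toy_weights = toy_tfidf.fit_transform(toy_docs)
print(toy_tfidf.get_feature_names())
print(toy_weights.toarray().round(2))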

Character, simple word, and sentence counts


In [5]:
# char count
# word count
# sentence count
# word count lens

In [12]:
def char_len():
    """Count the character length of each document."""
    t0 = time.time()
    char_lens = {}
    for i, doc in enumerate(text_list):
        char_lens[i] = pd.Series(len(doc), index=['char_len'])
    df_char_len = pd.DataFrame(char_lens).transpose()

    fp = os.path.expanduser('~/cltk_data/user_data/char_len_latin.csv')
    df_char_len.to_csv(fp)
    print('Time to create doc len counts: {}'.format(time.time() - t0))

char_len()


Time to create doc len counts: 0.24905800819396973
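
Each of these frames is indexed by the document's position in text_list, so they can later be joined on that index. For the character lengths, the same frame can be built in one step; a sketch of an equivalent (not what the notebook actually ran):

In [ ]:
# equivalent one-liner: one row per document, indexed by position in text_list
df_char_len = pd.DataFrame({'char_len': [len(doc) for doc in text_list]})
df_char_len.head()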

In [16]:
def word_count():
    """Count the words in each document."""
    t0 = time.time()
    p = PunktLanguageVars()
    word_counts = {}
    for i, doc in enumerate(text_list):
        wc_doc = len(p.word_tokenize(doc))
        word_counts[i] = pd.Series(wc_doc, index=['word_count'])
    df_word_count = pd.DataFrame(word_counts).transpose()

    fp = os.path.expanduser('~/cltk_data/user_data/word_count_latin.csv')  # distinct from the word-length file written below
    df_word_count.to_csv(fp)
    print('Time to create doc word count: {}'.format(time.time() - t0))

word_count()


Time to create doc word count: 11.002326011657715

In [34]:
text_list_no_cleanup = works_texts_list(rm_punctuation=False, rm_periods=False)

In [14]:
text_list_no_cleanup[1][:500]


Out[14]:
'  Maecenas atavis edite regibvs. Hac ode Maecenatem adloquitur indicans alium alio studio teneri rerum, quae adpetantur uel ludicri cupiditate uel gloriae; se autem putare inter deos relatuiri, si numero lyricorum poetarum adscriptus fuerit. Maecenatem ait atauis regibus editum, quod a nobilibus Etruscorum ortus sit. Palmaqve nobilis terrarvm dominos evehit ad deos. Ambiguum, utrum nobilis deos an nobilis palma. Mobiles quirites ait referens ad uulgi leuitatem. Loquitur autem de eo, qui fauorem '

In [5]:
# see how sent tokenizer works
s = ' ex scriptis eorum qui ueri arbitrantur . . . neque ipsi eos alii modi esse atque Amilcar dixit, ostendere possunt aliter. antequam Barcha perierat, alii rei causa in Africam missus . . . . . . tantum bellum suscitare conari aduersarios contra bellosum genus. qui cum is ita foedus icistis . . . . . . cum iure sine periculo bellum geri poteratur. qui intellegunt quae fiant, dissentiuntur. Legati quo missi sunt ueniunt, dedicant mandata. Saguntinorum Sempronius Lilybaeo celocem in Africam mittit u'
tokenizer = TokenizeSentence('latin')
sent_tokens = tokenizer.tokenize_sentences(s)
sent_tokens = [s for s in sent_tokens if len(s) > 1]  # rm '.' sents
sent_tokens


Out[5]:
[' ex scriptis eorum qui ueri arbitrantur .',
 'neque ipsi eos alii modi esse atque Amilcar dixit, ostendere possunt aliter.',
 'antequam Barcha perierat, alii rei causa in Africam missus .',
 'tantum bellum suscitare conari aduersarios contra bellosum genus.',
 'qui cum is ita foedus icistis .',
 'cum iure sine periculo bellum geri poteratur.',
 'qui intellegunt quae fiant, dissentiuntur.',
 'Legati quo missi sunt ueniunt, dedicant mandata.',
 'Saguntinorum Sempronius Lilybaeo celocem in Africam mittit u']

In [9]:
def sentence_count():
    """Count the sentences in each document."""
    t0 = time.time()
    tokenizer = TokenizeSentence('latin')
    sentence_counts = {}
    for i, doc in enumerate(text_list_no_cleanup):
        sent_tokens = tokenizer.tokenize_sentences(doc)
        sents_doc = [s for s in sent_tokens if len(s) > 1]  # rm '.' sents
        sentence_counts[i] = pd.Series(len(sents_doc), index=['sentence_count'])
    df_sentence_count = pd.DataFrame(sentence_counts).transpose()

    fp = os.path.expanduser('~/cltk_data/user_data/sentence_count_lens_latin.csv')
    df_sentence_count.to_csv(fp)
    print('Time to create doc sentence count: {}'.format(time.time() - t0))

sentence_count()


Time to create doc sentence count: 25.400434017181396

In [43]:
def word_len_counts():
    """Count the distribution of word lengths in each document."""
    t0 = time.time()
    p = PunktLanguageVars()
    word_counts = {}
    for i, doc in enumerate(text_list_no_cleanup):
        word_tokens = p.word_tokenize(doc)
        list_of_counts = ['word_len_' + str(len(w)) for w in word_tokens]
        counter_word_counts = Counter(list_of_counts)
        word_counts[i] = pd.Series(counter_word_counts, index=counter_word_counts.keys())
    df_word_count = pd.DataFrame(word_counts).transpose()

    fp = os.path.expanduser('~/cltk_data/user_data/word_count_lens_latin.csv')
    df_word_count.to_csv(fp)
    print('Time to create doc word length counts: {}'.format(time.time() - t0))

word_len_counts()


Time to create doc word length counts: 18.22161316871643
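
Because each document only contributes the word lengths it actually contains, the combined frame has NaN wherever a given length never occurs. If the CSV is meant to feed a model, a hypothetical post-processing step is to reload it and fill the gaps with zero:

In [ ]:
# hypothetical cleanup: a missing word-length value means a count of zero
fp = os.path.expanduser('~/cltk_data/user_data/word_count_lens_latin.csv')
df_word_lens = pd.read_csv(fp, index_col=0)
df_word_lens = df_word_lens.fillna(0).astype(int)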

In [50]:
def sentence_word_count():
    """Count the words per sentence in each document."""
    t0 = time.time()
    tokenizer_sent = TokenizeSentence('latin')
    p = PunktLanguageVars()
    word_counts = {}
    for i, doc in enumerate(text_list_no_cleanup):
        list_words_per_sentence = []
        sent_tokens = tokenizer_sent.tokenize_sentences(doc)
        sent_tokens = [s for s in sent_tokens if len(s) > 1]
        for sent in sent_tokens:
            word_tokens = p.word_tokenize(sent)
            words_in_sent = len(word_tokens)
            list_words_per_sentence.append(words_in_sent)
        list_of_counts = ['words_in_sent_' + str(count) for count in list_words_per_sentence]
        counter_word_counts_per_sents = Counter(list_of_counts)
        word_counts[i] = pd.Series(counter_word_counts_per_sents,
                                   index=counter_word_counts_per_sents.keys())
    df_word_count_per_sent = pd.DataFrame(word_counts).transpose()

    fp = os.path.expanduser('~/cltk_data/user_data/words_per_sent_latin.csv')
    df_word_count_per_sent.to_csv(fp)
    print('Time to create count of words per sentence: {}'.format(time.time() - t0))

sentence_word_count()


Time to create count of words per sentence: 35.22085189819336

In [ ]:
def pos_counts(index_start=0, index_break=99):
    """Count part-of-speech tags in each document, processing the corpus in chunks."""
    t0 = time.time()
    tokenizer_sent = TokenizeSentence('latin')
    pos_counts = {}
    tagger = POSTag('latin')
    # tag docs index_start through index_break so the slow TnT tagging can run in chunks
    for i, doc in enumerate(text_list_no_cleanup[index_start:], start=index_start):
        print('Processing doc #{}'.format(i))
        pos_tags_list = []
        sent_tokens = tokenizer_sent.tokenize_sentences(doc)
        sent_tokens = [s for s in sent_tokens if len(s) > 1]
        for sent in sent_tokens:
            pos_tags = tagger.tag_tnt(sent.lower())
            pos_tags = [t[1] for t in pos_tags]
            pos_tags_list += pos_tags
        pos_counts_counter = Counter(pos_tags_list)
        pos_counts[i] = pd.Series(pos_counts_counter, index=pos_counts_counter.keys())
        
        if i == index_break:
            print('breaking …')
            break

    df_pos_counts = pd.DataFrame(pos_counts).transpose()

    fp = os.path.expanduser('~/cltk_data/user_data/pos_counts_latin_{}.csv'.format(index_start))
    df_pos_counts.to_csv(fp)
    print('Time to create POS counts: {}'.format(time.time() - t0))

pos_counts(index_start=0, index_break=99)
#pos_counts(index_start=100)
#pos_counts(index_start=200)
#pos_counts(index_start=300)
#pos_counts(index_start=400)
#pos_counts(index_start=500)
#pos_counts(index_start=600)
#pos_counts(index_start=700)
#pos_counts(index_start=800)


Processing doc #0
Processing doc #1
Processing doc #2
Processing doc #3
Processing doc #4
Processing doc #5
Processing doc #6
Processing doc #7
Processing doc #8
Processing doc #9
Processing doc #10
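
Since pos_counts() writes one CSV per chunk (pos_counts_latin_0.csv, pos_counts_latin_100.csv, and so on), the chunks eventually need to be stitched back together. A hypothetical sketch, assuming all chunk files have been written:

In [ ]:
import glob

# hypothetical: concatenate the per-chunk POS-count CSVs into one frame
chunk_fps = sorted(glob.glob(os.path.expanduser('~/cltk_data/user_data/pos_counts_latin_*.csv')))
df_pos_all = pd.concat([pd.read_csv(fp, index_col=0) for fp in chunk_fps])
df_pos_all = df_pos_all.fillna(0).astype(int)
df_pos_all.to_csv(os.path.expanduser('~/cltk_data/user_data/pos_counts_latin_all.csv'))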

In [ ]: